This implementation only accepts a pandas DataFrame and tracks the columns to keep as string (column-name) objects.
Reasoning for only accepting pandas:
Streams of data often arrive in chunks (key-value pairs or otherwise) with no intrinsic ordering, so assuming a fixed column order in the model matrix is probably inappropriate.
Dealing with numpy matrices (to do in the future):
Presume an order which is to be enforced; the easiest way is to name the columns by position (column0, column1, ... etc.), as sketched below.
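A minimal sketch of that future numpy path, assuming positional names are simply enforced by wrapping the array in a DataFrame before it reaches the model (the as_named_frame helper is hypothetical, not part of the implementation below):

import numpy as np
import pandas as pd

def as_named_frame(X, prefix='column'):
    # wrap a 2-D numpy array in a DataFrame with positional column names
    # (column0, column1, ...) so downstream code can key on names, not positions
    return pd.DataFrame(X, columns=['{}{}'.format(prefix, i) for i in range(X.shape[1])])

chunk = np.random.randn(10, 5)          # a chunk of streamed data
as_named_frame(chunk).columns.tolist()  # ['column0', 'column1', 'column2', 'column3', 'column4']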
In [1]:
import sklearn
In [2]:
from sklearn.datasets import make_regression, make_classification
from sklearn.linear_model import SGDRegressor, SGDClassifier
import pandas as pd
import numpy as np
In [3]:
X, y = make_classification(n_features=100)
pdf = pd.DataFrame(X)
pdf.columns = ['c{}'.format(x) for x in range(X.shape[1])]
In [4]:
X.shape
Out[4]:
(100, 100)
In [10]:
X1 = pdf[['c{}'.format(x) for x in range(50, 100)]]
X2 = pdf[['c{}'.format(x) for x in range(50)]]
In [38]:
class GraftingClassifier(SGDClassifier):
    def __init__(self, loss="log", penalty='l2', alpha=0.0001, l1_ratio=0.15,
                 fit_intercept=True, max_iter=None, tol=None, shuffle=True,
                 verbose=0, epsilon=0.1, n_jobs=1,
                 random_state=None, learning_rate="optimal", eta0=0.0,
                 power_t=0.5, class_weight=None, warm_start=False,
                 average=False, n_iter=None,
                 reg_penalty=None):
        # mirrors the SGDClassifier signature of the scikit-learn release used here
        super(GraftingClassifier, self).__init__(
            loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
            fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
            shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs,
            random_state=random_state, learning_rate=learning_rate, eta0=eta0,
            power_t=power_t, class_weight=class_weight, warm_start=warm_start,
            average=average, n_iter=n_iter)
        # bookkeeping: kept columns, their coefficients, and excluded columns
        self.coef_info = {'cols': [], 'coef': [], 'excluded_cols': []}
        self.seen_cols = []
        self.base_shape = None
        # threshold on |coef| below which a column is dropped; defaults to l1_ratio
        self.reg_penalty = reg_penalty if reg_penalty is not None else l1_ratio

    def add_column_exclusion(self, cols):
        self.coef_info['excluded_cols'] = list(self.coef_info['excluded_cols']) + list(cols)

    def _fit_columns(self, X_, return_x=True, transform_only=False):
        """
        Filter out excluded (uninformative) columns and reorder the remaining
        ones so that previously kept columns come first, matching the order of
        their stored coefficients. Selection is by column name, not position.
        (``return_x`` and ``transform_only`` are currently unused placeholders.)
        """
        X = X_[X_.columns.difference(self.coef_info['excluded_cols'])]
        # order the columns correctly: previously kept columns first, then new ones
        col_order = self.coef_info['cols'] + [x for x in X.columns if x not in self.coef_info['cols']]
        X = X[col_order]
        return X

    def _reg_penalty(self, X):
        # keep only the columns whose coefficient magnitude clears the threshold;
        # every other column that has been seen goes into the excluded set
        col_coef = [(col, coef) for col, coef in zip(X.columns.tolist(), self.coef_.flatten())
                    if np.abs(coef) >= self.reg_penalty]
        self.coef_info['cols'] = [x for x, _ in col_coef]
        self.coef_info['coef'] = [x for _, x in col_coef]
        self.coef_info['excluded_cols'] = [x for x in self.seen_cols if x not in self.coef_info['cols']]
        self.coef_ = np.array(self.coef_info['coef']).reshape(1, -1)

    def fit(self, X, y, coef_init=None, intercept_init=None,
            sample_weight=None):
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        super(GraftingClassifier, self).fit(X, y, coef_init=coef_init,
                                            intercept_init=intercept_init,
                                            sample_weight=sample_weight)
        self._reg_penalty(X)
        return self

    def partial_fit(self, X, y, sample_weight=None):
        self.seen_cols = list(set(self.seen_cols + X.columns.tolist()))
        # TODO: add the spectral selection here
        # it should only consider "unseen" columns
        X = self._fit_columns(X)
        # warm-start the coefficients: previously learned coefficients come first
        # (matching the column order above), new columns start at zero
        n_samples, n_features = X.shape
        coef_list = np.zeros(n_features, dtype=np.float64, order="C")
        coef_list[:len(self.coef_info['coef'])] = self.coef_info['coef']
        self.coef_ = coef_list.reshape(1, -1)
        super(GraftingClassifier, self).partial_fit(X, y, sample_weight=sample_weight)
        self._reg_penalty(X)
        return self

    def predict(self, X):
        # filter/reorder the incoming frame by name so column order does not matter
        X = self._fit_columns(X, transform_only=True)
        return super(GraftingClassifier, self).predict(X)
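The cells below never exercise add_column_exclusion, so here is a minimal sketch of how it could be used (assuming the same c0..c99 frame built above; not run as part of the notebook): columns excluded by hand are filtered out by _fit_columns before partial_fit, so they never enter the coefficient bookkeeping.

m = GraftingClassifier(max_iter=1000, l1_ratio=1.0)
m.fit(X1, y)
# keep c0 and c1 out of consideration when the remaining columns arrive
m.add_column_exclusion(['c0', 'c1'])
m.partial_fit(pdf, y)
[c for c in ('c0', 'c1') if c in m.coef_info['cols']]  # expected: []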
In [39]:
model = GraftingClassifier(max_iter=1000, l1_ratio=1.0)
model.fit(X1, y)
Out[39]:
In [40]:
model.coef_.shape
Out[40]:
In [41]:
model.partial_fit(pdf, y)
Out[41]:
In [42]:
model.coef_.shape
Out[42]:
In [43]:
model.coef_info.keys()
Out[43]:
In [44]:
len(model.coef_info['cols'])
Out[44]:
In [45]:
len(model.coef_info['coef'])
Out[45]:
In [46]:
len(model.coef_info['excluded_cols'])
Out[46]:
In [47]:
model.predict(pdf)
Out[47]:
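Because _fit_columns filters and reorders the incoming frame by column name before prediction, the column order of the frame passed to predict should not matter. A small hedged check along these lines (not part of the original run):

shuffled = pdf[list(reversed(pdf.columns.tolist()))]
np.array_equal(model.predict(shuffled), model.predict(pdf))  # expected: True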